import pandas as pd
import re

filename = '../../Data/zhanghao/music.csv'
output_filename = 'top_10_high_rating_music.csv'


def _parse_review_count(raw_counts, default=10):
    """Return integer review counts parsed from raw ``count`` values.

    Each value is converted to text, the '人评价' suffix is removed, and the
    first run of digits is used as the count.

    Args:
        raw_counts: Series of raw count values (strings, numbers, or missing).
        default: Count used when a value contains no digits at all, which
            includes missing values.

    Returns:
        A Series of ints aligned with ``raw_counts``.
    """
    # astype(str) keeps the .str accessor usable even when pandas parsed the
    # column as numeric or when it contains NaN/None.
    as_text = raw_counts.astype(str).str.replace('人评价', '', regex=False)
    first_digits = as_text.str.extract(r'(\d+)', expand=False)
    return first_digits.fillna(str(default)).astype(int)


def select_top_rated(df, min_count=2000, top_n=10):
    """Return the highest-rated tracks that have enough reviews.

    Args:
        df: DataFrame with 'music_name', 'grade' and 'count' columns.
            It is not modified.
        min_count: Only rows whose review count is strictly greater than
            this value are kept.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with at most ``top_n`` rows, sorted by grade from
        high to low. 'count' is converted to int and 'grade' to a numeric
        type; grades that cannot be parsed become NaN and sort last.
    """
    # Keep only the first row seen for each track name.
    unique_tracks = df.drop_duplicates(subset=['music_name'], keep='first')

    # assign() builds a new frame, so nothing is written onto a filtered
    # slice (avoids SettingWithCopyWarning).
    cleaned = unique_tracks.assign(
        count=_parse_review_count(unique_tracks['count']),
        grade=pd.to_numeric(unique_tracks['grade'], errors='coerce'),
    )

    popular = cleaned[cleaned['count'] > min_count]

    # A stable sort keeps tied grades in their original file order, so the
    # selected top rows are reproducible from run to run.
    ranked = popular.sort_values(by='grade', ascending=False, kind='mergesort')
    return ranked.head(top_n)


def main():
    """Read the music CSV, select the top-rated tracks, and save them."""
    # Load only the grade, review-count and track-name columns.
    df = pd.read_csv(filename, usecols=['grade', 'count', 'music_name'])
    top_10_music = select_top_rated(df)
    # Save the result as a CSV file without the index column.
    top_10_music.to_csv(output_filename, index=False)


if __name__ == '__main__':
    main()
